# Stack store x group files into store files 
# JHL 

# Clean environment and initialize
# NOTE(review): this removes every object (including hidden ones) from the
# workspace the script is run in; harmless in a fresh batch session, but it
# will wipe an interactive session if the file is source()'d there.
rm(list = ls(all = TRUE))

# Packages: install whatever from the list below is not installed yet
list.of.packages <- c("folderfun", "data.table", "bit64", "lubridate", "foreign", "dplyr", "ggplot2")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if (length(new.packages) > 0) install.packages(new.packages,repos = "http://cran.us.r-project.org")

### load necessary packages ###
# library() stops right here when a package cannot be loaded; require() would
# only return FALSE and let the script fail later with an unrelated error.
library(folderfun)
library("data.table")
library("lubridate")
library("parallel")

# Setup paths
# setff() registers the "path_home" folder function; ffpath_home() returns it.
setff("path_home")
path_home <- ffpath_home()

# do_path:    where the cached file listing (RData) is kept
# repository: directory holding the per-store x group CSV files
# savepath:   sprintf() template for the stacked per-store output files
do_path <- sprintf("%s/do/data_prep/01_price_index",path_home)
repository <- sprintf('%s/dta/nielsen/PIq_06_15/PI_by_Store',path_home)
savepath <- paste0(path_home,'/dta/nielsen/PIq_06_15/storePI_stacked/PI_%s.csv')

# Progress/timing printer.
#
# Arguments:
#   verbose   - when FALSE nothing is printed (the time is still returned)
#   msg       - text to print; NA means "no message"
#   prev.time - elapsed-time reading from an earlier call; NA means "no
#               previous reading", in which case only msg is printed
#
# Returns the current elapsed-time reading (proc.time()[3]) so the caller can
# pass it back in as prev.time on the next call.
verbose.Print.Time <- function(verbose, msg=NA, prev.time=NA) {
    now <- proc.time()[3]

    # Quiet mode: hand back the clock reading without printing anything
    if (!verbose) {
        return(now)
    }

    # Without a previous reading there is no duration to report
    if (is.na(prev.time)) {
        cat(sprintf("%s", msg))
        return(now)
    }

    elapsed <- now - prev.time
    if (is.na(msg)) {
        # End of a timed section: duration only
        cat(sprintf("(%1.0fs), ", elapsed))
    } else {
        # Usual case: duration of the previous step, then the next message
        cat(sprintf("(%1.0fs); %s ", elapsed, msg))
    }
    return(now)
}

# Progress marker printed before the stacking routine is defined and run
cat("getting the file path", "\n", sep = "")

# Helper: for each path, return the k-th underscore-separated field of its
# base name counted from the end (k = 1 is the last field), with everything
# from the first "." onwards removed. Gives NA when the name has fewer than
# k fields.
.stack_name_field <- function(paths, k) {
    pieces <- strsplit(basename(as.character(unlist(paths))), "_", fixed = TRUE)
    vapply(pieces, function(p) {
        pos <- length(p) - k + 1L
        if (pos < 1L) NA_character_ else sub("[.].*$", "", p[[pos]])
    }, character(1))
}

# Helper: turn numeric store codes into labels without scientific notation
# (as.character(100000) is "1e+05", which would corrupt file names).
.stack_store_label <- function(codes) {
    vapply(codes, function(code) format(code, scientific = FALSE, trim = TRUE),
           character(1))
}

# Stack the per-(store x product group) CSV files of one slice of stores into
# one CSV per store.
#
# Input files are expected to be named <fips>_<productgroup>_<storecode>.csv
# inside `repository`. The list of files and their store codes is cached in
# "<do_path>/PIq_06_15_files.RData" and rebuilt only when that file is missing.
# Stores are divided into 100 slices at the percentiles of the unique store
# codes; the slice to process is chosen by the SLURM_ARRAY_TASK_ID environment
# variable (1..100). For every store in the slice all its files are read, a
# `group` column (taken from the file name) is added to each, the files are
# row-bound (filling missing columns), a `store_code_uc` column is added, and
# the result is written to sprintf(savepath, <store code>).
#
# Arguments:
#   collapse_opt - default FALSE (store-level). Kept for backward
#                  compatibility; it does not change what is processed.
#   cores        - number of worker processes used to write the output files.
#
# Uses from the enclosing script: do_path, repository, savepath and
# verbose.Print.Time(). Returns (invisibly) the last timing reading.
stack <- function(collapse_opt = FALSE, cores = 8) {

    n_chunks <- 100 # must agree with the 0.01 quantile step used below
    cache_file <- sprintf("%s/PIq_06_15_files.RData", do_path)

    # Build the file listing once and cache it; otherwise load the cache.
    # The table is loaded regardless of collapse_opt so that `file.table`
    # is always defined from here on.
    if (!file.exists(cache_file)) {
        old_wd <- setwd(repository)
        on.exit(setwd(old_wd), add = TRUE) # do not leave the caller in `repository`

        # Unsorted `ls` is much faster than a sorted listing on huge directories.
        listing <- system("ls -f -R", intern = TRUE)
        # Keep only the data files: with -f the ".:" header, "." and ".." are
        # not guaranteed to be the first entries, so they cannot be dropped by
        # position.
        # NOTE(review): files inside sub-directories would be listed without
        # their directory prefix; the repository is assumed to be flat.
        listing <- grep("[.]csv$", listing, value = TRUE)
        if (length(listing) == 0) {
            stop("No .csv files found in ", repository, call. = FALSE)
        }
        paths <- paste0(repository, "/", listing)

        # file_name is kept as a list column, the layout of existing caches
        file.table <- data.table(as.list(paths),
                                 as.numeric(.stack_name_field(paths, 1L)))
        setnames(file.table, c("file_name", "store_codes"))
        save(file.table, file = cache_file) # Save to RData for quick loading
    } else {
        load(cache_file) # defines `file.table`
    }

    # Slice index supplied by the SLURM job array (a string in the environment)
    myID <- suppressWarnings(as.numeric(Sys.getenv("SLURM_ARRAY_TASK_ID")))
    if (is.na(myID) || myID %% 1 != 0 || myID < 1 || myID > n_chunks) {
        stop(sprintf("SLURM_ARRAY_TASK_ID must be an integer between 1 and %d",
                     n_chunks), call. = FALSE)
    }

    verbose <- TRUE

    prev.time <- verbose.Print.Time(verbose, "List locations")

    # Entries whose store code could not be parsed cannot be assigned to a slice
    file.table <- file.table[!is.na(store_codes)]

    breaks <- quantile(unique(file.table$store_codes), seq(0, 1, by = 0.01))
    lower <- breaks[[myID]]
    upper <- breaks[[myID + 1]]
    # Half-open slices [lower, upper) so a store sitting exactly on a
    # percentile is handled by one task only (otherwise two tasks would write
    # the same output file); the last slice also includes its upper bound.
    if (myID < n_chunks) {
        chunk <- file.table[store_codes >= lower & store_codes < upper]
    } else {
        chunk <- file.table[store_codes >= lower & store_codes <= upper]
    }

    if (nrow(chunk) == 0) {
        cat("No stores in this slice")
        cat('\n')
        prev.time <- verbose.Print.Time(verbose, "Done stacking", prev.time)
        return(invisible(prev.time))
    }

    # Files grouped by store, stores in ascending code order
    codes <- sort(unique(chunk$store_codes))
    labels <- .stack_store_label(codes)
    chunk_paths <- as.character(unlist(chunk$file_name))
    files_by_store <- unname(split(chunk_paths, match(chunk$store_codes, codes)))

    ### Read all the csv files by store and then stack them

    prev.time <- verbose.Print.Time(verbose, "Bind stores", prev.time)

    output <- vector("list", length(codes))
    for (i in seq_along(codes)) {
        this_store <- labels[[i]]
        store_files <- files_by_store[[i]]

        cat(sprintf("Reading: %s store", this_store))
        tables <- lapply(store_files, fread)

        # Take the product group out of the file name and put it into the data
        # Sample: /scratch/midway2/jleung/PIq_06_15/PI_by_Store/1_4511_1001.csv
        groups <- as.numeric(.stack_name_field(store_files, 2L))
        for (j in seq_along(tables)) {
            this_group <- groups[j]
            tables[[j]][, group := this_group]
        }

        cat(sprintf("Stacking: %s store", this_store))
        cat('\n')
        stacked <- rbindlist(tables, use.names = TRUE, fill = TRUE)
        # Add the store code (as text, matching the output file name)
        stacked[, store_code_uc := this_store]
        output[[i]] <- stacked
    }
    names(output) <- labels

    ### Save the output using mclapply (parallelized version of lapply)

    cat("saving")
    cat('\n')

    prev.time <- verbose.Print.Time(verbose, "Save bound files", prev.time)

    dir.create(dirname(savepath), recursive = TRUE, showWarnings = FALSE)

    # Each worker returns TRUE on success; mclapply turns a worker error into
    # a "try-error" element (or NULL) instead of raising it, so check below.
    written <- mclapply(seq_along(output), function(k) {
        write.csv(output[[k]], file = sprintf(savepath, labels[[k]]),
                  row.names = FALSE)
        TRUE
    }, mc.set.seed = TRUE, mc.cores = cores)

    ok <- vapply(written, isTRUE, logical(1))
    if (!all(ok)) {
        stop(sprintf("Failed to write %d store file(s): %s", sum(!ok),
                     paste(labels[!ok], collapse = ", ")), call. = FALSE)
    }

    prev.time <- verbose.Print.Time(verbose, "Done stacking", prev.time)
    invisible(prev.time)
}

# Run the store-level stacking for this array task
stack(collapse_opt = FALSE)



